import re  # needed by words_re below; harmless if already imported earlier

# Sentences are kept only when both sides have fewer than MAX_LEN tokens.
MAX_LEN = 10

# Read the tab-separated corpus. The context manager guarantees the file
# handle is closed even if reading fails.
with open('./cmn-eng/cmn.txt', encoding='utf-8') as corpus_file:
    lines = corpus_file.read().strip().split('\n')

words_re = re.compile(r'\w+')

# Build (english_word_tokens, chinese_characters) pairs.
pairs = []
for line in lines:
    if not line.strip():
        # Skip blank lines (an empty file yields a single '' entry).
        continue
    # Only the first two columns are used; any trailing columns are ignored,
    # so files with two, three, or more tab-separated fields all parse.
    en_sent, cn_sent = line.split('\t')[:2]
    pairs.append((words_re.findall(en_sent.lower()), list(cn_sent)))

# Create a smaller dataset to speed up the demo: short sentences whose
# English side starts with one of these subject pronouns.
_SUBJECT_PRONOUNS = ('i', 'you', 'he', 'she', 'we', 'they')
filtered_pairs = [
    (en_tokens, cn_chars)
    for en_tokens, cn_chars in pairs
    if len(en_tokens) < MAX_LEN
    and len(cn_chars) < MAX_LEN
    # Guard against sentences with no word tokens before indexing [0].
    and en_tokens
    and en_tokens[0] in _SUBJECT_PRONOUNS
]

print(len(filtered_pairs))
for x in filtered_pairs[:10]:
    print(x)
